{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "cd7996c5-91eb-41de-ac6a-0b22462b76f1",
   "metadata": {},
   "source": [
    "# Lesson 4: Auto-merging Retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8f0928d-4505-437c-9ee8-bada425a5e22",
   "metadata": {
    "height": 47
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20373372-03c5-41e9-8bc9-dfa6ce35d666",
   "metadata": {
    "height": 98
   },
   "outputs": [],
   "source": [
    "import utils\n",
    "\n",
    "import os\n",
    "import openai\n",
    "openai.api_key = utils.get_openai_api_key()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f58fc195-9619-4088-831e-66b3cc6e0425",
   "metadata": {
    "height": 98,
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index import SimpleDirectoryReader\n",
    "\n",
    "documents = SimpleDirectoryReader(\n",
    "    input_files=[\"./eBook-How-to-Build-a-Career-in-AI.pdf\"]\n",
    ").load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89ad555b-48d4-441b-b5f3-79a685b4c3f2",
   "metadata": {
    "height": 81,
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(type(documents), \"\\n\")\n",
    "print(len(documents), \"\\n\")\n",
    "print(type(documents[0]))\n",
    "print(documents[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c143850-72c6-4a09-9fb0-a216108314c0",
   "metadata": {},
   "source": [
    "## Auto-merging retrieval setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8926348c-a4c5-471b-99ed-e6bf170f1fe1",
   "metadata": {
    "height": 64,
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index import Document\n",
    "\n",
    "document = Document(text=\"\\n\\n\".join([doc.text for doc in documents]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "202ccfb5-cf94-48a4-b23b-8c5a5ab73ebc",
   "metadata": {
    "height": 115,
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index.node_parser import HierarchicalNodeParser\n",
    "\n",
    "# create the hierarchical node parser w/ default settings\n",
    "node_parser = HierarchicalNodeParser.from_defaults(\n",
    "    chunk_sizes=[2048, 512, 128]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "672fd574-6b3c-43ad-9a9a-2db423cadd48",
   "metadata": {
    "height": 30,
    "tags": []
   },
   "outputs": [],
   "source": [
    "nodes = node_parser.get_nodes_from_documents([document])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83cfc301-1887-4b1c-a8aa-1fb55e30be78",
   "metadata": {
    "height": 81,
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index.node_parser import get_leaf_nodes\n",
    "\n",
    "leaf_nodes = get_leaf_nodes(nodes)\n",
    "print(leaf_nodes[30].text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0703abeb-3bcc-4dd6-8010-3bcf95d8b483",
   "metadata": {
    "height": 81,
    "tags": []
   },
   "outputs": [],
   "source": [
    "nodes_by_id = {node.node_id: node for node in nodes}\n",
    "\n",
    "parent_node = nodes_by_id[leaf_nodes[30].parent_node.node_id]\n",
    "print(parent_node.text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6bafef6-02f6-4029-a5eb-dfbc4369da56",
   "metadata": {},
   "source": [
    "### Building the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b0350cd-392e-45e9-8664-ff22a39a8bcd",
   "metadata": {
    "height": 64,
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index.llms import OpenAI\n",
    "\n",
    "llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03213595-6581-47f5-90a7-cfa381d42872",
   "metadata": {
    "height": 132,
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index import ServiceContext\n",
    "\n",
    "auto_merging_context = ServiceContext.from_defaults(\n",
    "    llm=llm,\n",
    "    embed_model=\"local:BAAI/bge-small-en-v1.5\",\n",
    "    node_parser=node_parser,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0f84886-e1aa-40f1-a812-4b6f8d0204af",
   "metadata": {
    "height": 200,
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index import VectorStoreIndex, StorageContext\n",
    "\n",
    "storage_context = StorageContext.from_defaults()\n",
    "storage_context.docstore.add_documents(nodes)\n",
    "\n",
    "automerging_index = VectorStoreIndex(\n",
    "    leaf_nodes, storage_context=storage_context, service_context=auto_merging_context\n",
    ")\n",
    "\n",
    "automerging_index.storage_context.persist(persist_dir=\"./merging_index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a8ddbff-6fdd-448d-9ce8-53ed782a6d66",
   "metadata": {
    "height": 455,
    "tags": []
   },
   "outputs": [],
   "source": [
    "# This block of code is optional to check\n",
    "# if an index file exist, then it will load it\n",
    "# if not, it will rebuild it\n",
    "\n",
    "import os\n",
    "from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage\n",
    "from llama_index import load_index_from_storage\n",
    "\n",
    "if not os.path.exists(\"./merging_index\"):\n",
    "    storage_context = StorageContext.from_defaults()\n",
    "    storage_context.docstore.add_documents(nodes)\n",
    "\n",
    "    automerging_index = VectorStoreIndex(\n",
    "            leaf_nodes,\n",
    "            storage_context=storage_context,\n",
    "            service_context=auto_merging_context\n",
    "        )\n",
    "\n",
    "    automerging_index.storage_context.persist(persist_dir=\"./merging_index\")\n",
    "else:\n",
    "    automerging_index = load_index_from_storage(\n",
    "        StorageContext.from_defaults(persist_dir=\"./merging_index\"),\n",
    "        service_context=auto_merging_context\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2710c689-0596-45c5-a63f-1e8bb9481846",
   "metadata": {},
   "source": [
    "### Defining the retriever and running the query engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d443d25a-2a5e-425c-9cca-4bf8503c6db3",
   "metadata": {
    "height": 353,
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index.indices.postprocessor import SentenceTransformerRerank\n",
    "from llama_index.retrievers import AutoMergingRetriever\n",
    "from llama_index.query_engine import RetrieverQueryEngine\n",
    "\n",
    "automerging_retriever = automerging_index.as_retriever(\n",
    "    similarity_top_k=12\n",
    ")\n",
    "\n",
    "retriever = AutoMergingRetriever(\n",
    "    automerging_retriever, \n",
    "    automerging_index.storage_context, \n",
    "    verbose=True\n",
    ")\n",
    "\n",
    "rerank = SentenceTransformerRerank(top_n=6, model=\"BAAI/bge-reranker-base\")\n",
    "\n",
    "auto_merging_engine = RetrieverQueryEngine.from_args(\n",
    "    automerging_retriever, node_postprocessors=[rerank]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c458d0ac-24a0-430b-acc4-c05252134527",
   "metadata": {
    "height": 64,
    "tags": []
   },
   "outputs": [],
   "source": [
    "auto_merging_response = auto_merging_engine.query(\n",
    "    \"What is the importance of networking in AI?\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "569e70b4-daae-4181-a953-7ea37f4620c7",
   "metadata": {
    "height": 64
   },
   "outputs": [],
   "source": [
    "from llama_index.response.notebook_utils import display_response\n",
    "\n",
    "display_response(auto_merging_response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a841036a-0bb6-4037-920e-ae555748e111",
   "metadata": {},
   "source": [
    "## Putting it all Together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8949da64-bc7f-4929-b394-51d11732e62f",
   "metadata": {
    "height": 1101
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from llama_index import (\n",
    "    ServiceContext,\n",
    "    StorageContext,\n",
    "    VectorStoreIndex,\n",
    "    load_index_from_storage,\n",
    ")\n",
    "from llama_index.node_parser import HierarchicalNodeParser\n",
    "from llama_index.node_parser import get_leaf_nodes\n",
    "from llama_index import StorageContext, load_index_from_storage\n",
    "from llama_index.retrievers import AutoMergingRetriever\n",
    "from llama_index.indices.postprocessor import SentenceTransformerRerank\n",
    "from llama_index.query_engine import RetrieverQueryEngine\n",
    "\n",
    "\n",
    "def build_automerging_index(\n",
    "    documents,\n",
    "    llm,\n",
    "    embed_model=\"local:BAAI/bge-small-en-v1.5\",\n",
    "    save_dir=\"merging_index\",\n",
    "    chunk_sizes=None,\n",
    "):\n",
    "    chunk_sizes = chunk_sizes or [2048, 512, 128]\n",
    "    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)\n",
    "    nodes = node_parser.get_nodes_from_documents(documents)\n",
    "    leaf_nodes = get_leaf_nodes(nodes)\n",
    "    merging_context = ServiceContext.from_defaults(\n",
    "        llm=llm,\n",
    "        embed_model=embed_model,\n",
    "    )\n",
    "    storage_context = StorageContext.from_defaults()\n",
    "    storage_context.docstore.add_documents(nodes)\n",
    "\n",
    "    if not os.path.exists(save_dir):\n",
    "        automerging_index = VectorStoreIndex(\n",
    "            leaf_nodes, storage_context=storage_context, service_context=merging_context\n",
    "        )\n",
    "        automerging_index.storage_context.persist(persist_dir=save_dir)\n",
    "    else:\n",
    "        automerging_index = load_index_from_storage(\n",
    "            StorageContext.from_defaults(persist_dir=save_dir),\n",
    "            service_context=merging_context,\n",
    "        )\n",
    "    return automerging_index\n",
    "\n",
    "\n",
    "def get_automerging_query_engine(\n",
    "    automerging_index,\n",
    "    similarity_top_k=12,\n",
    "    rerank_top_n=6,\n",
    "):\n",
    "    base_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)\n",
    "    retriever = AutoMergingRetriever(\n",
    "        base_retriever, automerging_index.storage_context, verbose=True\n",
    "    )\n",
    "    rerank = SentenceTransformerRerank(\n",
    "        top_n=rerank_top_n, model=\"BAAI/bge-reranker-base\"\n",
    "    )\n",
    "    auto_merging_engine = RetrieverQueryEngine.from_args(\n",
    "        retriever, node_postprocessors=[rerank]\n",
    "    )\n",
    "    return auto_merging_engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7f52577-2e8e-4fee-ade9-2f5ce96a7a8c",
   "metadata": {
    "height": 149
   },
   "outputs": [],
   "source": [
    "from llama_index.llms import OpenAI\n",
    "\n",
    "index = build_automerging_index(\n",
    "    [document],\n",
    "    llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1),\n",
    "    save_dir=\"./merging_index\",\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c753885f-bfaf-4ac0-8410-b62f619827c4",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "query_engine = get_automerging_query_engine(index, similarity_top_k=6)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9130e35d-bc09-4e4d-bbd9-61c9d6847110",
   "metadata": {},
   "source": [
    "## TruLens Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf034319-2f0a-43ac-896b-2aef1762b3bd",
   "metadata": {
    "height": 64
   },
   "outputs": [],
   "source": [
    "from trulens_eval import Tru\n",
    "\n",
    "Tru().reset_database()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76e8c8b7-8d47-45cb-94f3-70c2d75c9500",
   "metadata": {},
   "source": [
    "### Two layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fd32404-2760-420d-b406-61d50c5ae7de",
   "metadata": {
    "height": 132
   },
   "outputs": [],
   "source": [
    "auto_merging_index_0 = build_automerging_index(\n",
    "    documents,\n",
    "    llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1),\n",
    "    embed_model=\"local:BAAI/bge-small-en-v1.5\",\n",
    "    save_dir=\"merging_index_0\",\n",
    "    chunk_sizes=[2048,512],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad319255-b86a-4d05-8dd2-a8fa02d15ed0",
   "metadata": {
    "height": 98
   },
   "outputs": [],
   "source": [
    "auto_merging_engine_0 = get_automerging_query_engine(\n",
    "    auto_merging_index_0,\n",
    "    similarity_top_k=12,\n",
    "    rerank_top_n=6,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c669cee9-7be9-4a4d-8c17-1672bb9c3abf",
   "metadata": {
    "height": 115
   },
   "outputs": [],
   "source": [
    "from utils import get_prebuilt_trulens_recorder\n",
    "\n",
    "tru_recorder = get_prebuilt_trulens_recorder(\n",
    "    auto_merging_engine_0,\n",
    "    app_id ='app_0'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad216bb7-100e-455f-8c33-aa68b3e42026",
   "metadata": {
    "height": 115
   },
   "outputs": [],
   "source": [
    "eval_questions = []\n",
    "with open('generated_questions.text', 'r') as file:\n",
    "    for line in file:\n",
    "        # Remove newline character and convert to integer\n",
    "        item = line.strip()\n",
    "        eval_questions.append(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d471cb1d-be4e-423b-a5a5-f216cbd8c9ff",
   "metadata": {
    "height": 81
   },
   "outputs": [],
   "source": [
    "def run_evals(eval_questions, tru_recorder, query_engine):\n",
    "    for question in eval_questions:\n",
    "        with tru_recorder as recording:\n",
    "            response = query_engine.query(question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eea6f236-fe6a-470f-82b5-1a1e3d2593ce",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "run_evals(eval_questions, tru_recorder, auto_merging_engine_0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54cc6991-e743-4e33-9e22-414362a6aae0",
   "metadata": {
    "height": 64
   },
   "outputs": [],
   "source": [
    "from trulens_eval import Tru\n",
    "\n",
    "Tru().get_leaderboard(app_ids=[])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f204af1a-ab10-4e50-b2d6-ba78e2a66e90",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "Tru().run_dashboard()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c36721ff-9fe4-43e9-b7ed-435f6d054e29",
   "metadata": {},
   "source": [
    "### Three layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a575af9-37b8-442a-b6f2-3ed04bf742f1",
   "metadata": {
    "height": 132
   },
   "outputs": [],
   "source": [
    "auto_merging_index_1 = build_automerging_index(\n",
    "    documents,\n",
    "    llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1),\n",
    "    embed_model=\"local:BAAI/bge-small-en-v1.5\",\n",
    "    save_dir=\"merging_index_1\",\n",
    "    chunk_sizes=[2048,512,128],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0493fb6-3a5b-4d14-81ba-f26d9e4d46b3",
   "metadata": {
    "height": 115
   },
   "outputs": [],
   "source": [
    "auto_merging_engine_1 = get_automerging_query_engine(\n",
    "    auto_merging_index_1,\n",
    "    similarity_top_k=12,\n",
    "    rerank_top_n=6,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3198fa2-6a81-4edd-a0c8-843c3838f56f",
   "metadata": {
    "height": 81
   },
   "outputs": [],
   "source": [
    "tru_recorder = get_prebuilt_trulens_recorder(\n",
    "    auto_merging_engine_1,\n",
    "    app_id ='app_1'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5063c989-9f96-4bfa-bb50-fa9dbf6d576c",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "run_evals(eval_questions, tru_recorder, auto_merging_engine_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c26b5c94-16ce-415d-93ca-a131b05f903f",
   "metadata": {
    "height": 64
   },
   "outputs": [],
   "source": [
    "from trulens_eval import Tru\n",
    "\n",
    "Tru().get_leaderboard(app_ids=[])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3988b3f8-bd4c-41e5-be50-e90b8366f4bd",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "Tru().run_dashboard()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
